Import Data¶

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
import re

import statsmodels.api as sm
from pylab import *
homestay = pd.read_csv('AB_NYC_2019.csv')
In [2]:
homestay.head()
Out[2]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
In [3]:
len(homestay) # check amount of rows
Out[3]:
48895

I. Data preprocessing¶

a. Deleting redundant features¶

In [4]:
homestay.dtypes # check types of the columns
Out[4]:
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object
In [5]:
homestay.drop(['id','host_name','last_review'], axis=1, inplace=True) # Drop the 'host_name' not only because it is insignificant for analysis but also for ethical reasons. 

b. Data cleaning¶

i. Missing values¶

In [6]:
homestay.isnull().sum()# check missing values
Out[6]:
name                                 16
host_id                               0
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64
In [7]:
homestay.fillna({'reviews_per_month':0},inplace=True) # If there were no reviews for the listing, review_per_month"  simply will not exist. So, we can simply append it with 0.0 for missing values.
homestay.isnull().sum() # recheck missing values
Out[7]:
name                              16
host_id                            0
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
dtype: int64

ii. Checking duplicates¶

In [8]:
homestay.duplicated().sum()
Out[8]:
0

c. Feature engineering¶

In [9]:
homestay["price"].describe() 
# The numerical distribution of the price reveals a mean value of 153.1. However, the price range extends from 0 to 10000, indicating the presence of some outliers.
Out[9]:
count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64
In [10]:
homestay['price'].hist() # most of them are less than 1000
In [11]:
homestay["price"][homestay["price"]<400].hist()
In [12]:
homestay = homestay[homestay["price"]<=400]
In [13]:
homestay["price"].describe()
Out[13]:
count    47132.000000
mean       126.405202
std         78.172200
min          0.000000
25%         67.000000
50%        100.000000
75%        165.000000
max        400.000000
Name: price, dtype: float64
In [14]:
homestay['minimum_nights'].describe()
Out[14]:
count    47132.000000
mean         6.974391
std         20.343423
min          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
max       1250.000000
Name: minimum_nights, dtype: float64
In [15]:
homestay['minimum_nights'].hist()
Out[15]:
<AxesSubplot: >
In [16]:
homestay['minimum_nights'][homestay['minimum_nights']<100].hist()
Out[16]:
<AxesSubplot: >
In [17]:
homestay.loc[(homestay.minimum_nights >30),'minimum_nights'] = 30
homestay['minimum_nights'][homestay['minimum_nights']<30].hist() # better distribution
Out[17]:
<AxesSubplot: >

II. Data exploration and visualization¶

a. Categorical variables counts¶

In [18]:
for column in homestay.select_dtypes(include = object):
    print(column,'counts:\n',homestay[column].value_counts(dropna = False))
    print('\n\n')
name counts:
 Hillside Hotel                                        18
Home away from home                                   17
NaN                                                   16
New york Multi-unit building                          14
Brooklyn Apartment                                    12
                                                      ..
Unique 2BR Apartment                                   1
STUNNING ONE BEDROOM IN THE HEART OF NEW YORK CITY     1
One bedroom in Beautiful Astoria with balcony!         1
Elegantly designed 1bd room apt                        1
Trendy duplex in the very heart of Hell's Kitchen      1
Name: name, Length: 46177, dtype: int64



neighbourhood_group counts:
 Manhattan        20366
Brooklyn         19712
Queens            5612
Bronx             1077
Staten Island      365
Name: neighbourhood_group, dtype: int64



neighbourhood counts:
 Williamsburg          3823
Bedford-Stuyvesant    3667
Harlem                2617
Bushwick              2454
Upper West Side       1867
                      ... 
Silver Lake              2
Richmondtown             1
New Dorp                 1
Rossville                1
Willowbrook              1
Name: neighbourhood, Length: 219, dtype: int64



room_type counts:
 Entire home/apt    23874
Private room       22110
Shared room         1148
Name: room_type, dtype: int64



b. Who owns more listings?¶

In [19]:
top_host=homestay.host_id.value_counts().head(10)

top_host_df=pd.DataFrame(top_host)
top_host_df.reset_index(inplace=True)
top_host_df.rename(columns={'index':'Host_ID', 'host_id':'Count'}, inplace=True)
top_host_df = top_host_df.sort_values(by='Count', ascending=False)  # count decending rank

plt.figure(figsize=(14, 10))
viz_1 = sns.barplot(x="Host_ID", y="Count", data=top_host_df, order=top_host_df.sort_values('Count', ascending=False).Host_ID, palette='Blues_d')

viz_1.set_title('Hosts with the most listings in NYC',fontsize = 20)
viz_1.set_ylabel('Count of listings',fontsize = 14)
viz_1.set_xlabel('Host IDs')
viz_1.set_xticklabels(viz_1.get_xticklabels(), rotation=45)
plt.xticks(fontsize=12) 
plt.yticks(fontsize=12)

for p in viz_1.patches:
    viz_1.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 5), 
                   textcoords = 'offset points')

    

plt.show()

c. Which borough is more expensive?¶

In [20]:
neighbourhood_counts = homestay['neighbourhood_group'].value_counts()
plt.figure(figsize=(8, 6))
neighbourhood_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title("Neighbourhood Group Counts")
plt.show()
In [21]:
sub_1 = homestay.loc[homestay['neighbourhood_group'] == 'Brooklyn']
price_sub1 = sub_1[['price']]

sub_2 = homestay.loc[homestay['neighbourhood_group'] == 'Manhattan']
price_sub2 = sub_2[['price']]

sub_3 = homestay.loc[homestay['neighbourhood_group'] == 'Queens']
price_sub3 = sub_3[['price']]

sub_4 = homestay.loc[homestay['neighbourhood_group'] == 'Staten Island']
price_sub4 = sub_4[['price']]

sub_5 = homestay.loc[homestay['neighbourhood_group'] == 'Bronx']
price_sub5 = sub_5[['price']]

price_list_by_n = [price_sub1, price_sub2, price_sub3, price_sub4, price_sub5]
In [22]:
p_l_b_n_2=[]
nei_list=['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx']

for x in price_list_by_n:
    i=x.describe(percentiles=[.25, .50, .75])
    i=i.iloc[3:]
    i.reset_index(inplace=True)
    i.rename(columns={'index':'Stats'}, inplace=True)
    p_l_b_n_2.append(i)

p_l_b_n_2[0].rename(columns={'price':nei_list[0]}, inplace=True)
p_l_b_n_2[1].rename(columns={'price':nei_list[1]}, inplace=True)
p_l_b_n_2[2].rename(columns={'price':nei_list[2]}, inplace=True)
p_l_b_n_2[3].rename(columns={'price':nei_list[3]}, inplace=True)
p_l_b_n_2[4].rename(columns={'price':nei_list[4]}, inplace=True)

stat_df=p_l_b_n_2
stat_df=[df.set_index('Stats') for df in stat_df]
stat_df=stat_df[0].join(stat_df[1:])
stat_df
Out[22]:
Brooklyn Manhattan Queens Staten Island Bronx
Stats
min 0.0 0.0 10.0 13.0 0.0
25% 60.0 90.0 50.0 50.0 45.0
50% 90.0 140.0 75.0 75.0 65.0
75% 145.0 200.0 110.0 105.0 98.0
max 400.0 400.0 400.0 300.0 399.0
In [23]:
# using violinplot to showcase density and distribtuion of prices 
homestay0 = homestay[homestay.price < 500]
viz_2=sns.violinplot(data=homestay0, x='neighbourhood_group', y='price')
viz_2.set_title('Density and distribution of prices for each neighbourhood_group')

d. Diving into the room types¶

In [24]:
room_type_counts = homestay['room_type'].value_counts()

plt.figure(figsize=(8, 6))
room_type_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title("Room Type Counts")

plt.show()
In [25]:
#using violinplot to showcase density and distribtuion of prices 
viz_2=sns.violinplot(data=homestay0, x='room_type', y='price')
viz_2.set_title('Density and distribution of prices for each room type')
In [26]:
homestay.neighbourhood.value_counts().head(10)
Out[26]:
Williamsburg          3823
Bedford-Stuyvesant    3667
Harlem                2617
Bushwick              2454
Upper West Side       1867
Hell's Kitchen        1834
East Village          1774
Upper East Side       1724
Crown Heights         1538
Midtown               1301
Name: neighbourhood, dtype: int64
In [27]:
sub_7 = homestay.loc[homestay['neighbourhood'].isin(['Williamsburg', 'Bedford-Stuyvesant', 'Harlem', 'Bushwick', 'Upper West Side', 'Hell\'s Kitchen', 'East Village', 'Upper East Side', 'Crown Heights', 'Midtown'])]
viz_3 = sns.catplot(x='neighbourhood', hue='neighbourhood_group', col='room_type', data=sub_7, kind='count')
viz_3.set_xticklabels(rotation=70) 
plt.show()
plt.figure(figsize=(20, 10))  
<Figure size 2000x1000 with 0 Axes>
In [28]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Filter non-strings
name_list = [str(name) for name in homestay.name if isinstance(name, str)]

plt.subplots(figsize=(25,15))
wordcloud = WordCloud(
    background_color='white',
    width=2000,
    height=1000
).generate(" ".join(name_list))

plt.imshow(wordcloud)
plt.axis('off')
plt.show()

e. Correlations¶

In [29]:
homestay.drop(['host_id',"latitude",'longitude'], axis=1, inplace=True)
homestay1 = pd.get_dummies(homestay,columns = ['neighbourhood_group','room_type'],drop_first=True)
homestay1.drop(["neighbourhood"], axis=1, inplace=True)
In [30]:
corr = homestay1.corr(method='kendall', numeric_only=True)
plt.figure(figsize = (15, 15))
ax = sns.heatmap(corr, xticklabels = corr.columns, yticklabels = corr.columns, linewidth = 0.2, cmap = 'YlGnBu', annot = True, annot_kws={"fontsize":14}) ## annot:把值打出来,annot_kws: 标注字体大小
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 20) 
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 20) 
plt.title('Heatmap of Correlation Coefficient',fontsize = 25)
plt.show()

III. Modeling¶

a. Does homestay name imply any price information?¶

i. Preparations¶

In [31]:
homestay2 = homestay1.copy()
homestay2['name'].fillna('', inplace=True)
homestay2['name'].isnull().sum()
Out[31]:
0
In [32]:
def remove_punctuation_digits_specialchar(line):
    return re.sub('[^A-Za-z]+', ' ', line).lower()

homestay2['clean_name'] = homestay2['name'].apply(remove_punctuation_digits_specialchar)
homestay2[['name', 'clean_name']].tail()
Out[32]:
name clean_name
48890 Charming one bedroom - newly renovated rowhouse charming one bedroom newly renovated rowhouse
48891 Affordable room in Bushwick/East Williamsburg affordable room in bushwick east williamsburg
48892 Sunny Studio at Historical Neighborhood sunny studio at historical neighborhood
48893 43rd St. Time Square-cozy single bed rd st time square cozy single bed
48894 Trendy duplex in the very heart of Hell's Kitchen trendy duplex in the very heart of hell s kitchen
In [33]:
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords

def tokenize_no_stopwords(line):
    tokens = nltk.tokenize.word_tokenize(line)
    tokens_no_stop = [w for w in tokens if w not in stopwords.words('english')]
    return " ".join(tokens_no_stop)
homestay2['final_name'] = homestay2['clean_name'].apply(tokenize_no_stopwords)
homestay2[['name', 'clean_name', 'final_name']].head()
Out[33]:
name clean_name final_name
0 Clean & quiet apt home by the park clean quiet apt home by the park clean quiet apt home park
1 Skylit Midtown Castle skylit midtown castle skylit midtown castle
2 THE VILLAGE OF HARLEM....NEW YORK ! the village of harlem new york village harlem new york
3 Cozy Entire Floor of Brownstone cozy entire floor of brownstone cozy entire floor brownstone
4 Entire Apt: Spacious Studio/Loft by central park entire apt spacious studio loft by central park entire apt spacious studio loft central park
In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMRegressor, LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, classification_report, mean_absolute_error, r2_score
from imblearn.over_sampling import SMOTE

def classify_price_category(price):
    if price > 300:
        return 2
    elif price > 100:
        return 1
    else:
        return 0

homestay2['target'] = homestay2['price'].apply(classify_price_category)
homestay2['target'].value_counts()
train, test = train_test_split(homestay2, test_size=0.1, random_state=4, stratify=homestay2['target'])

X_train, y_train = train['final_name'], train['target']
X_test, y_test = test['final_name'], test['target']

vect = TfidfVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

# over-sampling
smote = SMOTE(random_state=4)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
In [35]:
homestay2['target'].value_counts()
Out[35]:
0    23928
1    21610
2     1594
Name: target, dtype: int64
In [36]:
y_train_resampled_value_counts = y_train_resampled.value_counts()
print("SMOTE applied target distribution: ", y_train_resampled_value_counts)
SMOTE applied target distribution:  1    21535
0    21535
2    21535
Name: target, dtype: int64

ii. Decision tree classification¶

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report


dt = DecisionTreeClassifier(random_state=4)
dt.fit(X_train_resampled, y_train_resampled)
preds = dt.predict(X_test)

# confusion matrix
cm = confusion_matrix(y_test, preds)
label_map = {'0': 'low', '1': 'medium', '2': 'high'}
cm_headers = [f"actual {label_map[str(i)]}" for i in range(cm.shape[0])]
cm_columns = [f"predicted {label_map[str(i)]}" for i in range(cm.shape[1])]
cm = pd.DataFrame(cm, columns=cm_columns, index=cm_headers)
print("Confusion matrix:")
print(cm.astype(int))

# classification report
cr = classification_report(y_test, preds, target_names=label_map.values(), digits=3)

print("\nClassification report:")
print(cr)
Confusion matrix:
               predicted low  predicted medium  predicted high
actual low              1655               700              38
actual medium            651              1365             145
actual high               31                94              35

Classification report:
              precision    recall  f1-score   support

         low      0.708     0.692     0.700      2393
      medium      0.632     0.632     0.632      2161
        high      0.161     0.219     0.185       160

    accuracy                          0.648      4714
   macro avg      0.500     0.514     0.506      4714
weighted avg      0.655     0.648     0.651      4714

ii. Light gradient boosting machine classification¶

In [38]:
lr = LGBMClassifier(random_state=4)
lr.fit(X_train_resampled, y_train_resampled)
preds = lr.predict(X_test)

# confusion matrix
cm = confusion_matrix(y_test, preds)
label_map = {'0': 'low', '1': 'medium', '2': 'high'}
cm_headers = [f"actual {label_map[str(i)]}" for i in range(cm.shape[0])]
cm_columns = [f"predicted {label_map[str(i)]}" for i in range(cm.shape[1])]
cm = pd.DataFrame(cm, columns=cm_columns, index=cm_headers)
print("Confusion matrix:")
print(cm.astype(int))

# classification report
cr = classification_report(y_test, preds, target_names=label_map.values(), digits=3)
print("\nClassification report:")
print(cr)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59554
[LightGBM] [Info] Number of data points in the train set: 64605, number of used features: 960
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Confusion matrix:
               predicted low  predicted medium  predicted high
actual low              1754               607              32
actual medium            551              1491             119
actual high               26                94              40

Classification report:
              precision    recall  f1-score   support

         low      0.752     0.733     0.743      2393
      medium      0.680     0.690     0.685      2161
        high      0.209     0.250     0.228       160

    accuracy                          0.697      4714
   macro avg      0.547     0.558     0.552      4714
weighted avg      0.701     0.697     0.699      4714

b. How to price?¶

i. Linear regression¶

In [39]:
homestay1.drop(['name'], axis=1, inplace=True)
In [40]:
X = homestay1.loc[:,homestay1.columns != 'price']
y = homestay1['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)
In [41]:
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)
print('R² = ',linreg.score(X_train, y_train).round(3))
print ('RMSE = %.3f'%np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
R² =  0.438
RMSE = 60.241
In [42]:
# Difference between test and pred
plt.figure(figsize = (40,20))
plt.plot(range(len(y_pred)),y_pred-y_test,'lightpink')
plt.plot(range(len(y_pred)),np.zeros(4714),'black')
plt.ylim = (-300,200)
plt.title('Difference Between Test and Pred of Linear Regression',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
In [43]:
# estimated - actual price plot
test_pred = pd.DataFrame(y_test)
test_pred['esti_price'] = y_pred

test_pred.plot(x='price',y='esti_price',kind='scatter',fontsize = 10)
plt.title('test_pred_scatter_LR',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')
plt.xlim = ([0,400])
plt.ylim = ([0,500])

ii. Lasso regression¶

In [44]:
homestay3 = homestay.copy()
homestay3.drop(['name'], axis=1, inplace=True)
homestay3 = pd.get_dummies(homestay3, columns=['neighbourhood_group', 'neighbourhood', 'room_type'], drop_first=True)

X1 = homestay3.loc[:, homestay3.columns != 'price']
y1 = homestay3['price']

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.1, random_state=4)

Lassoreg = Lasso(alpha=0.01)
Lassoreg.fit(X_train1, y_train1)
Out[44]:
Lasso(alpha=0.01)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Lasso(alpha=0.01)
In [45]:
coef_dict = dict(zip(X_train1.columns, Lassoreg.coef_.round(3)))
sorted(coef_dict.items(),key = lambda x:x[1],reverse = True)
Out[45]:
[('neighbourhood_DUMBO', 75.086),
 ('neighbourhood_group_Manhattan', 64.846),
 ('neighbourhood_Tribeca', 56.408),
 ('neighbourhood_Vinegar Hill', 52.385),
 ('neighbourhood_NoHo', 37.058),
 ('neighbourhood_Boerum Hill', 36.569),
 ('neighbourhood_Downtown Brooklyn', 35.689),
 ('neighbourhood_Park Slope', 34.326),
 ('neighbourhood_Williamsburg', 33.877),
 ('neighbourhood_Cobble Hill', 33.682),
 ('neighbourhood_Midtown', 32.98),
 ('neighbourhood_Gowanus', 32.047),
 ('neighbourhood_Carroll Gardens', 29.884),
 ('neighbourhood_Brooklyn Heights', 29.879),
 ('neighbourhood_South Slope', 29.444),
 ('neighbourhood_Fort Greene', 28.481),
 ('neighbourhood_Flatiron District', 27.209),
 ('neighbourhood_Greenpoint', 25.317),
 ('neighbourhood_Prospect Heights', 24.843),
 ('neighbourhood_Theater District', 24.548),
 ('neighbourhood_West Village', 24.164),
 ('neighbourhood_Long Island City', 23.895),
 ('neighbourhood_Clinton Hill', 22.603),
 ('neighbourhood_Arverne', 21.059),
 ('neighbourhood_Greenwich Village', 20.044),
 ('neighbourhood_Chelsea', 19.712),
 ('neighbourhood_SoHo', 19.386),
 ('neighbourhood_Nolita', 18.854),
 ("neighbourhood_Hell's Kitchen", 13.83),
 ('neighbourhood_Windsor Terrace', 13.51),
 ('neighbourhood_group_Brooklyn', 13.026),
 ('neighbourhood_Red Hook', 9.032),
 ('neighbourhood_Breezy Point', 8.188),
 ('neighbourhood_group_Queens', 7.999),
 ('neighbourhood_Murray Hill', 7.202),
 ('neighbourhood_Columbia St', 7.011),
 ('neighbourhood_Astoria', 6.877),
 ('neighbourhood_Battery Park City', 6.643),
 ('neighbourhood_Gramercy', 5.376),
 ('neighbourhood_Financial District', 3.608),
 ('neighbourhood_Crown Heights', 3.023),
 ('neighbourhood_Elmhurst', 2.115),
 ('neighbourhood_Woodside', 1.374),
 ('neighbourhood_Ditmars Steinway', 1.03),
 ('neighbourhood_Bedford-Stuyvesant', 1.023),
 ('neighbourhood_Clason Point', 0.75),
 ('calculated_host_listings_count', 0.124),
 ('availability_365', 0.088),
 ('neighbourhood_Forest Hills', 0.057),
 ('neighbourhood_Arden Heights', -0.0),
 ('neighbourhood_Arrochar', -0.0),
 ('neighbourhood_Bath Beach', -0.0),
 ('neighbourhood_Bay Terrace', 0.0),
 ('neighbourhood_Bay Terrace, Staten Island', -0.0),
 ('neighbourhood_Baychester', -0.0),
 ('neighbourhood_Bayside', 0.0),
 ('neighbourhood_Bayswater', -0.0),
 ('neighbourhood_Belle Harbor', 0.0),
 ('neighbourhood_Bellerose', 0.0),
 ('neighbourhood_Belmont', 0.0),
 ('neighbourhood_Bergen Beach', -0.0),
 ('neighbourhood_Briarwood', 0.0),
 ("neighbourhood_Bull's Head", 0.0),
 ('neighbourhood_Bushwick', 0.0),
 ('neighbourhood_Cambria Heights', 0.0),
 ('neighbourhood_Castle Hill', -0.0),
 ('neighbourhood_Castleton Corners', 0.0),
 ('neighbourhood_City Island', -0.0),
 ('neighbourhood_Civic Center', 0.0),
 ('neighbourhood_Claremont Village', 0.0),
 ('neighbourhood_Clifton', -0.0),
 ('neighbourhood_Co-op City', 0.0),
 ('neighbourhood_College Point', -0.0),
 ('neighbourhood_Concord', -0.0),
 ('neighbourhood_Concourse', 0.0),
 ('neighbourhood_Concourse Village', -0.0),
 ('neighbourhood_Coney Island', 0.0),
 ('neighbourhood_Dongan Hills', -0.0),
 ('neighbourhood_Douglaston', -0.0),
 ('neighbourhood_Dyker Heights', -0.0),
 ('neighbourhood_East Elmhurst', -0.0),
 ('neighbourhood_East Morrisania', -0.0),
 ('neighbourhood_East Village', -0.0),
 ('neighbourhood_Eastchester', 0.0),
 ('neighbourhood_Edenwald', -0.0),
 ('neighbourhood_Edgemere', 0.0),
 ('neighbourhood_Eltingville', 0.0),
 ('neighbourhood_Emerson Hill', -0.0),
 ('neighbourhood_Far Rockaway', -0.0),
 ('neighbourhood_Fieldston', -0.0),
 ('neighbourhood_Flushing', 0.0),
 ('neighbourhood_Fordham', 0.0),
 ('neighbourhood_Fresh Meadows', 0.0),
 ('neighbourhood_Glendale', -0.0),
 ('neighbourhood_Graniteville', -0.0),
 ('neighbourhood_Grant City', -0.0),
 ('neighbourhood_Great Kills', 0.0),
 ('neighbourhood_Grymes Hill', 0.0),
 ('neighbourhood_Highbridge', -0.0),
 ('neighbourhood_Hollis', 0.0),
 ('neighbourhood_Holliswood', 0.0),
 ('neighbourhood_Howard Beach', 0.0),
 ('neighbourhood_Howland Hook', -0.0),
 ('neighbourhood_Huguenot', -0.0),
 ('neighbourhood_Hunts Point', -0.0),
 ('neighbourhood_Jamaica Estates', 0.0),
 ('neighbourhood_Jamaica Hills', 0.0),
 ('neighbourhood_Kew Gardens', 0.0),
 ('neighbourhood_Kew Gardens Hills', 0.0),
 ('neighbourhood_Kingsbridge', 0.0),
 ('neighbourhood_Laurelton', -0.0),
 ('neighbourhood_Lighthouse Hill', 0.0),
 ('neighbourhood_Little Italy', 0.0),
 ('neighbourhood_Little Neck', 0.0),
 ('neighbourhood_Longwood', 0.0),
 ('neighbourhood_Manhattan Beach', -0.0),
 ('neighbourhood_Mariners Harbor', 0.0),
 ('neighbourhood_Melrose', -0.0),
 ('neighbourhood_Middle Village', -0.0),
 ('neighbourhood_Midland Beach', -0.0),
 ('neighbourhood_Mill Basin', -0.0),
 ('neighbourhood_Morris Heights', 0.0),
 ('neighbourhood_Morris Park', -0.0),
 ('neighbourhood_Morrisania', 0.0),
 ('neighbourhood_Mott Haven', -0.0),
 ('neighbourhood_Mount Eden', -0.0),
 ('neighbourhood_Mount Hope', -0.0),
 ('neighbourhood_Navy Yard', 0.0),
 ('neighbourhood_Neponsit', 0.0),
 ('neighbourhood_New Brighton', 0.0),
 ('neighbourhood_New Dorp', -0.0),
 ('neighbourhood_New Dorp Beach', -0.0),
 ('neighbourhood_New Springville', -0.0),
 ('neighbourhood_North Riverdale', 0.0),
 ('neighbourhood_Norwood', 0.0),
 ('neighbourhood_Oakwood', -0.0),
 ('neighbourhood_Olinville', 0.0),
 ('neighbourhood_Parkchester', 0.0),
 ('neighbourhood_Pelham Bay', -0.0),
 ('neighbourhood_Port Morris', 0.0),
 ('neighbourhood_Port Richmond', 0.0),
 ("neighbourhood_Prince's Bay", 0.0),
 ('neighbourhood_Randall Manor', -0.0),
 ('neighbourhood_Rego Park', -0.0),
 ('neighbourhood_Richmondtown', -0.0),
 ('neighbourhood_Riverdale', 0.0),
 ('neighbourhood_Rockaway Beach', 0.0),
 ('neighbourhood_Rosebank', -0.0),
 ('neighbourhood_Rossville', -0.0),
 ('neighbourhood_Schuylerville', 0.0),
 ('neighbourhood_Sea Gate', 0.0),
 ('neighbourhood_Shore Acres', 0.0),
 ('neighbourhood_Silver Lake', 0.0),
 ('neighbourhood_Soundview', -0.0),
 ('neighbourhood_South Beach', 0.0),
 ('neighbourhood_South Ozone Park', -0.0),
 ('neighbourhood_Springfield Gardens', 0.0),
 ('neighbourhood_Spuyten Duyvil', 0.0),
 ('neighbourhood_St. Albans', 0.0),
 ('neighbourhood_St. George', 0.0),
 ('neighbourhood_Stapleton', 0.0),
 ('neighbourhood_Stuyvesant Town', -0.0),
 ('neighbourhood_Sunnyside', -0.0),
 ('neighbourhood_Throgs Neck', 0.0),
 ('neighbourhood_Todt Hill', 0.0),
 ('neighbourhood_Tompkinsville', -0.0),
 ('neighbourhood_Tottenville', 0.0),
 ('neighbourhood_Tremont', -0.0),
 ('neighbourhood_Unionport', -0.0),
 ('neighbourhood_University Heights', 0.0),
 ('neighbourhood_Van Nest', 0.0),
 ('neighbourhood_Wakefield', 0.0),
 ('neighbourhood_West Brighton', 0.0),
 ('neighbourhood_West Farms', 0.0),
 ('neighbourhood_Westchester Square', -0.0),
 ('neighbourhood_Westerleigh', -0.0),
 ('neighbourhood_Whitestone', 0.0),
 ('neighbourhood_Williamsbridge', -0.0),
 ('neighbourhood_Willowbrook', 0.0),
 ('neighbourhood_Woodlawn', -0.0),
 ('number_of_reviews', -0.085),
 ('neighbourhood_Richmond Hill', -0.11),
 ('neighbourhood_Sunset Park', -0.277),
 ('neighbourhood_Jackson Heights', -0.446),
 ('neighbourhood_Jamaica', -0.629),
 ('neighbourhood_Upper West Side', -0.67),
 ('neighbourhood_Ridgewood', -0.697),
 ('neighbourhood_group_Staten Island', -0.965),
 ('reviews_per_month', -1.219),
 ('minimum_nights', -1.245),
 ('neighbourhood_Rosedale', -2.026),
 ('neighbourhood_Gravesend', -2.042),
 ('neighbourhood_Bronxdale', -2.558),
 ('neighbourhood_Kensington', -2.774),
 ('neighbourhood_Bay Ridge', -2.89),
 ('neighbourhood_Fort Hamilton', -3.278),
 ('neighbourhood_Lower East Side', -3.599),
 ('neighbourhood_Kips Bay', -4.119),
 ('neighbourhood_Prospect-Lefferts Gardens', -4.589),
 ('neighbourhood_Brighton Beach', -4.689),
 ('neighbourhood_Brownsville', -5.238),
 ('neighbourhood_Maspeth', -5.756),
 ('neighbourhood_Sheepshead Bay', -6.12),
 ('neighbourhood_Two Bridges', -6.6),
 ('neighbourhood_East Flatbush', -7.486),
 ('neighbourhood_Chinatown', -7.671),
 ('neighbourhood_Cypress Hills', -7.989),
 ('neighbourhood_Pelham Gardens', -8.962),
 ('neighbourhood_Flatlands', -9.554),
 ('neighbourhood_Flatbush', -10.258),
 ('neighbourhood_Upper East Side', -10.938),
 ('neighbourhood_Woodhaven', -11.613),
 ('neighbourhood_Bensonhurst', -11.837),
 ('neighbourhood_Midwood', -12.191),
 ('neighbourhood_Corona', -12.343),
 ('neighbourhood_Canarsie', -13.17),
 ('neighbourhood_Queens Village', -13.415),
 ('neighbourhood_Ozone Park', -13.815),
 ('neighbourhood_East New York', -15.265),
 ('neighbourhood_Borough Park', -16.088),
 ('neighbourhood_Marble Hill', -22.158),
 ('neighbourhood_Roosevelt Island', -26.258),
 ('neighbourhood_East Harlem', -27.841),
 ('neighbourhood_Morningside Heights', -34.252),
 ('neighbourhood_Harlem', -35.712),
 ('neighbourhood_Washington Heights', -51.604),
 ('neighbourhood_Inwood', -55.222),
 ('room_type_Private room', -80.018),
 ('room_type_Shared room', -103.583)]
In [46]:
number_of_key_value_pairs = len(coef_dict)

print(f"The dictionary has {number_of_key_value_pairs} key-value pairs.")
The dictionary has 229 key-value pairs.
In [47]:
print('R² = ',Lassoreg.score(X_train1, y_train1).round(3))
y_pred1= Lassoreg.predict(X_test1)
print ('RMSE1 =%.3f'%np.sqrt(metrics.mean_squared_error(y_test1, y_pred1))) # Root Mean Square Error
R² =  0.501
RMSE1 =56.840
In [48]:
# Difference between test and pred
plt.figure(figsize = (40,20))
plt.plot(range(len(y_pred1)),y_pred1-y_test1,'lightpink')
plt.plot(range(len(y_pred1)),np.zeros(4714),'black')
plt.ylim = (-300,200)
plt.title('Difference Between Test and Pred of Lasso Regression',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
In [49]:
# estimated - actual price scatter plot
y_pred1 = Lassoreg.predict(X_test1)
test_pred1 = pd.DataFrame(y_test1)
test_pred1['esti_price1'] = y_pred1

test_pred1.plot(x='price',y='esti_price1',kind='scatter',fontsize = 10)
plt.title('test_pred_scatter_Lasso',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')
# distributed closer to 45° line

iii. Decision tree¶

In [50]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import random

random.seed(4)
np.random.seed(4)

param_grid = {"ccp_alpha": np.linspace(0, 1, 11)}
reg_tree = DecisionTreeRegressor()

# Pruning
grid_search = GridSearchCV(reg_tree, param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
grid_search.fit(X_train1, y_train1)
print("Scores for each parameter combination:")
means = -grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("CCP Alpha: %0.3f, RMSE: %0.3f (+/- %0.3f)" % (params['ccp_alpha'], np.sqrt(mean), std))

best_alpha = grid_search.best_params_['ccp_alpha']
print("best alpha: ", best_alpha)
Scores for each parameter combination:
CCP Alpha: 0.000, RMSE: 71.641 (+/- 144.632)
CCP Alpha: 0.100, RMSE: 68.462 (+/- 142.786)
CCP Alpha: 0.200, RMSE: 66.100 (+/- 156.040)
CCP Alpha: 0.300, RMSE: 63.734 (+/- 152.811)
CCP Alpha: 0.400, RMSE: 61.534 (+/- 157.203)
CCP Alpha: 0.500, RMSE: 59.850 (+/- 151.028)
CCP Alpha: 0.600, RMSE: 58.685 (+/- 157.464)
CCP Alpha: 0.700, RMSE: 57.309 (+/- 131.820)
CCP Alpha: 0.800, RMSE: 56.392 (+/- 122.917)
CCP Alpha: 0.900, RMSE: 55.851 (+/- 110.634)
CCP Alpha: 1.000, RMSE: 55.752 (+/- 102.677)
best alpha:  1.0
In [51]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Fit using the best alpha
feature_names = list(X_train1.columns)
best_reg_tree = DecisionTreeRegressor(ccp_alpha=best_alpha, random_state=4)
best_reg_tree.fit(X_train1, y_train1)
plt.figure(figsize=(40, 20))

# set max_depth = 3
plot_tree(best_reg_tree, filled=True, feature_names=feature_names, class_names=['price'],
           rounded=True, precision=2, impurity=True, node_ids=True, proportion=False,
           fontsize=18, label='all', max_depth=3)  
In [52]:
y_pred1 = best_reg_tree.predict(X_test1)

mse = mean_squared_error(y_test1, y_pred1)
rmse = np.sqrt(mse)
print("RMSE =  ", round(rmse, 3))
print('R² = ', best_reg_tree.score(X_train1, y_train1).round(3))
RMSE =   56.828
R² =  0.568

iv. Random forest¶

In [53]:
regrf = RandomForestRegressor(n_estimators=300) 
regrf.fit(X_train1, y_train1)
Out[53]:
RandomForestRegressor(n_estimators=300)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(n_estimators=300)
In [54]:
# Feature importance
importances = regrf.feature_importances_
weights = pd.Series(importances,
                 index=X_train1.columns.values)
print(weights.sort_values(ascending = False).round(3))

plt.figure(figsize=(15,10))
plt.title('Feature Importances',fontsize = 20)
weights.sort_values()[-10:].plot(kind = 'barh')
plt.xlabel('Relative Importance',fontsize = 15)
room_type_Private room        0.316
availability_365              0.111
reviews_per_month             0.105
number_of_reviews             0.081
minimum_nights                0.070
                              ...  
neighbourhood_Richmondtown    0.000
neighbourhood_Westerleigh     0.000
neighbourhood_Rossville       0.000
neighbourhood_Co-op City      0.000
neighbourhood_Silver Lake     0.000
Length: 229, dtype: float64
Out[54]:
Text(0.5, 0, 'Relative Importance')
In [55]:
print('R² = ',regrf.score(X_train1, y_train1).round(3))
y_predrf= regrf.predict(X_test1)
print('RMSE = ',np.sqrt(metrics.mean_squared_error(y_test1,y_predrf)).round(3))
# Coefficient of determinationR² is up to 89.6%,too high to overfitting
R² =  0.896
RMSE =  55.668
In [56]:
# Difference between test and pred
test_predrf = pd.DataFrame(y_test)
test_predrf['esti_price'] = y_predrf

plt.figure(figsize = (40,20))
plt.plot(range(len(y_predrf)),y_predrf-y_test1,'lightpink')
plt.plot(range(len(y_predrf)),np.zeros(4714),'black')
plt.ylim = (-300,200)
plt.title('Difference Between Test and Pred of Random Forest Regression before Selecting Parameters',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
In [57]:
# estimated - actual price scatter plot
test_predrf.plot(x='price',y='esti_price',kind='scatter',fontsize = 10)
plt.title('test_pred_scatterrf',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')
In [58]:
# Tuning Parameters - Random Search
# Step1: Creating a Hyperparameter Grid
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 6)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
rm_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(rm_grid)
{'n_estimators': [200, 400, 600, 800, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 30, 50, 70, 90, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
In [59]:
# Step2&3: 3-fold cross-validation with random search to find the corresponding parameters with the highest score

# It will take hours to carry out  this validation
rf1 = RandomForestRegressor()
# 3-fold cross-validation
rf1_random = RandomizedSearchCV(estimator = rf1, param_distributions = rm_grid, n_iter = 180, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf1_random.fit(X_train1, y_train1)
print(rf2_random.best_params_) # best parametre
Out[59]:
'# Step2&3: 3-fold cross-validation with random search to find the corresponding parameters with the highest score\n\n# It will take hours to carry out  this validation\nrf1 = RandomForestRegressor()\n# 3-fold cross-validation\nrf1_random = RandomizedSearchCV(estimator = rf1, param_distributions = rm_grid, n_iter = 180, cv = 3, verbose=2, random_state=42, n_jobs = -1)\nrf1_random.fit(X_train1, y_train1)\nprint(rf2_random.best_params_) # best parametre'
In [60]:
# rain a random forest model by using the optimal parameterst
regrf1 = RandomForestRegressor(n_estimators=400, max_depth = 110, min_samples_split = 10,min_samples_leaf =1,max_features = 'sqrt',bootstrap = True)
regrf1.fit(X_train1, y_train1)
Out[60]:
RandomForestRegressor(max_depth=110, max_features='sqrt', min_samples_split=10,
                      n_estimators=400)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(max_depth=110, max_features='sqrt', min_samples_split=10,
                      n_estimators=400)
In [61]:
# R² decreased and risk of overfitting declined
print('R² = ',regrf1.score(X_train1, y_train1).round(3))
R² =  0.737
In [62]:
# feature importance again
importances1 = regrf1.feature_importances_
weights1 = pd.Series(importances1,
                 index=X_train1.columns.values)
print(weights1.sort_values(ascending = False).round(3))

plt.figure(figsize=(15,10))
plt.title('Feature Importances1',fontsize = 20)
weights1.sort_values()[-10:].plot(kind = 'barh')
plt.xlabel('Relative Importance1',fontsize = 15)
room_type_Private room            0.322
availability_365                  0.090
reviews_per_month                 0.076
calculated_host_listings_count    0.063
number_of_reviews                 0.063
                                  ...  
neighbourhood_Oakwood             0.000
neighbourhood_Rossville           0.000
neighbourhood_Todt Hill           0.000
neighbourhood_Co-op City          0.000
neighbourhood_Silver Lake         0.000
Length: 229, dtype: float64
Out[62]:
Text(0.5, 0, 'Relative Importance1')
In [63]:
y_predrf1 = regrf1.predict(X_test1)
print('RMSE = ',np.sqrt(metrics.mean_squared_error(y_test1,y_predrf1)).round(3))
RMSE =  54.099
In [64]:
# Difference between test and pred
plt.figure(figsize = (40,20))
plt.plot(range(len(y_predrf1)),y_predrf1-y_test1,'lightpink')
plt.plot(range(len(y_predrf1)),np.zeros(4714),'black')
plt.ylim = (-300,200)
plt.title('Difference Between Test and Pred of Random Forest Regression after Selecting Parameters',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
In [65]:
# estimated - actual price scatter plot
test_predrf1 = pd.DataFrame(y_test)
test_predrf1['esti_price'] = y_predrf1

test_predrf1.plot(x='price',y='esti_price',kind='scatter',fontsize = 10)
plt.title('test_pred_scatterrf1',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')

v. Neural network¶

In [66]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, r2_score
# primary
model = keras.Sequential([
    layers.Dense(50, activation='relu', input_shape=[229]),
    layers.Dense(50, activation='relu'),
    layers.Dense(1)
])

model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.001))

history = model.fit(X_train1, y_train1, validation_split=0.1, epochs= 50, batch_size=64)

loss = model.evaluate(X_test1, y_test1)


y_pred1 = model.predict(X_test1)
y_pred1 = y_pred1.flatten()
RMSE = np.sqrt(mean_squared_error(y_test1, y_pred1))
r2 = r2_score(y_test1, y_pred1)
print ('RMSE = %.3f'%RMSE) 
print ('R² = %.3f'%r2) 
C:\Users\Tommy\AppData\Roaming\Python\Python311\site-packages\keras\src\layers\core\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Epoch 1/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 14270.0303 - val_loss: 5613.9502
Epoch 2/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 4309.5264 - val_loss: 3465.5352
Epoch 3/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3139.7681 - val_loss: 3279.2512
Epoch 4/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3019.5059 - val_loss: 3204.2534
Epoch 5/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3063.4666 - val_loss: 3174.3452
Epoch 6/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3002.1589 - val_loss: 3181.0767
Epoch 7/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2952.7341 - val_loss: 3184.8140
Epoch 8/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2867.6155 - val_loss: 3159.6880
Epoch 9/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2906.6179 - val_loss: 3325.4902
Epoch 10/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2967.6970 - val_loss: 3297.1353
Epoch 11/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2940.2827 - val_loss: 3313.5857
Epoch 12/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2933.9275 - val_loss: 3233.2585
Epoch 13/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2869.5593 - val_loss: 3183.4407
Epoch 14/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2917.6016 - val_loss: 3132.5483
Epoch 15/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2918.0779 - val_loss: 3145.6440
Epoch 16/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2947.0562 - val_loss: 3223.9087
Epoch 17/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2941.7246 - val_loss: 3189.1008
Epoch 18/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2862.6282 - val_loss: 3486.5457
Epoch 19/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2914.2515 - val_loss: 3173.9697
Epoch 20/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2920.6282 - val_loss: 3261.3159
Epoch 21/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2940.2378 - val_loss: 3168.7754
Epoch 22/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2815.1086 - val_loss: 3176.0923
Epoch 23/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2807.4758 - val_loss: 3061.6084
Epoch 24/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2820.5669 - val_loss: 3159.4500
Epoch 25/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2801.5427 - val_loss: 3083.7334
Epoch 26/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2807.3088 - val_loss: 3272.0383
Epoch 27/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2834.4709 - val_loss: 3087.1653
Epoch 28/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2926.2922 - val_loss: 3067.8433
Epoch 29/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2761.8306 - val_loss: 3082.7502
Epoch 30/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2762.4680 - val_loss: 3258.2122
Epoch 31/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2799.4119 - val_loss: 3072.9504
Epoch 32/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2794.4707 - val_loss: 3064.9441
Epoch 33/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2804.5110 - val_loss: 3184.1548
Epoch 34/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2897.9736 - val_loss: 3063.5586
Epoch 35/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2755.7126 - val_loss: 3151.7876
Epoch 36/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2792.2480 - val_loss: 3129.2773
Epoch 37/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2785.3416 - val_loss: 3168.3606
Epoch 38/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2851.5212 - val_loss: 3106.2759
Epoch 39/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2820.0664 - val_loss: 3070.3213
Epoch 40/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2810.0220 - val_loss: 3077.5725
Epoch 41/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2842.0059 - val_loss: 3117.4302
Epoch 42/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2793.7520 - val_loss: 3098.4836
Epoch 43/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2829.1147 - val_loss: 3086.9243
Epoch 44/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2764.5374 - val_loss: 3054.6953
Epoch 45/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2778.7917 - val_loss: 3192.8110
Epoch 46/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2779.3521 - val_loss: 3049.4778
Epoch 47/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2741.2744 - val_loss: 3154.0405
Epoch 48/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2784.9775 - val_loss: 3116.7498
Epoch 49/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2727.0957 - val_loss: 3049.5222
Epoch 50/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2732.6794 - val_loss: 3051.8562
148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 998us/step - loss: 2932.3076
148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
RMSE = 55.133
R² = 0.529
In [67]:
# Difference between test and pred
plt.figure(figsize = (40,20))
plt.plot(range(len(y_pred1)),y_pred1-y_test1,'lightpink')
plt.plot(range(len(y_pred1)),np.zeros(4714),'black')
plt.ylim = (-300,200)
plt.title('Difference Between Test and Pred of Neutral Network',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
In [68]:
# estimated - actual price scatter plot
test_prednn = pd.DataFrame(y_test)
test_prednn['esti_price'] = y_pred1

test_prednn.plot(x='price',y='esti_price',kind='scatter',fontsize = 10)
plt.title('test_pred_scatternn',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')
In [69]:
# L1 regularization
l1_regularization_strength = 0.01

# primary
model = keras.Sequential([
    layers.Dense(50, activation='relu', input_shape=[229],
                  kernel_regularizer=tf.keras.regularizers.l1(l1_regularization_strength)),
    layers.Dense(50, activation='relu',
                  kernel_regularizer=tf.keras.regularizers.l1(l1_regularization_strength)),
    layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l1(l1_regularization_strength))
])

model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.0001))

history = model.fit(X_train1, y_train1, validation_split=0.1, epochs=50, batch_size=64)

loss = model.evaluate(X_test1, y_test1)

y_pred1 = model.predict(X_test1)
y_pred1 = y_pred1.flatten()
RMSE = np.sqrt(mean_squared_error(y_test1, y_pred1))
r2 = r2_score(y_test1, y_pred1)
print('RMSE = %.3f' % RMSE)
print('R² = %.3f' % r2)
Epoch 1/50
C:\Users\Tommy\AppData\Roaming\Python\Python311\site-packages\keras\src\layers\core\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
597/597 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 20542.7969 - val_loss: 13567.7109
Epoch 2/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 12837.7803 - val_loss: 12305.3818
Epoch 3/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 11504.5684 - val_loss: 10999.7314
Epoch 4/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 10191.1875 - val_loss: 10080.3906
Epoch 5/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 9504.1758 - val_loss: 9165.3750
Epoch 6/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 8650.1934 - val_loss: 8036.6064
Epoch 7/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 7337.7583 - val_loss: 6868.9292
Epoch 8/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 6265.2163 - val_loss: 5882.5054
Epoch 9/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 5396.6641 - val_loss: 5141.2520
Epoch 10/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 4748.8843 - val_loss: 4587.0337
Epoch 11/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 4213.9956 - val_loss: 4259.2466
Epoch 12/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3891.1885 - val_loss: 3911.1287
Epoch 13/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3635.6692 - val_loss: 3693.4150
Epoch 14/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3477.2400 - val_loss: 3545.7544
Epoch 15/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3279.8562 - val_loss: 3465.7449
Epoch 16/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3169.5723 - val_loss: 3388.4314
Epoch 17/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3141.9705 - val_loss: 3352.7188
Epoch 18/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3109.7812 - val_loss: 3415.8994
Epoch 19/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3110.6162 - val_loss: 3317.2009
Epoch 20/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3038.2485 - val_loss: 3306.9614
Epoch 21/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3085.8132 - val_loss: 3379.8074
Epoch 22/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2989.7073 - val_loss: 3271.5540
Epoch 23/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3064.6990 - val_loss: 3263.1897
Epoch 24/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3064.9290 - val_loss: 3243.4165
Epoch 25/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3045.0242 - val_loss: 3247.7808
Epoch 26/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 3024.4351 - val_loss: 3236.9470
Epoch 27/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2963.3347 - val_loss: 3232.5823
Epoch 28/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 3030.8843 - val_loss: 3227.7363
Epoch 29/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2977.7139 - val_loss: 3252.9941
Epoch 30/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - loss: 2977.2830 - val_loss: 3257.7603
Epoch 31/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2924.6086 - val_loss: 3198.8594
Epoch 32/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2887.2996 - val_loss: 3202.9199
Epoch 33/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2913.2458 - val_loss: 3248.2896
Epoch 34/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2984.6938 - val_loss: 3234.5540
Epoch 35/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2901.1604 - val_loss: 3219.9175
Epoch 36/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2952.9514 - val_loss: 3185.1863
Epoch 37/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2967.8337 - val_loss: 3178.2781
Epoch 38/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2943.8218 - val_loss: 3179.9934
Epoch 39/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2896.6150 - val_loss: 3168.2979
Epoch 40/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2914.4419 - val_loss: 3169.5037
Epoch 41/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2969.8904 - val_loss: 3163.6829
Epoch 42/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2937.4021 - val_loss: 3169.7246
Epoch 43/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2891.6396 - val_loss: 3168.5266
Epoch 44/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2886.7256 - val_loss: 3207.7920
Epoch 45/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2887.6250 - val_loss: 3164.1267
Epoch 46/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2905.2673 - val_loss: 3169.6577
Epoch 47/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - loss: 2883.9983 - val_loss: 3159.8779
Epoch 48/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2906.4841 - val_loss: 3167.1526
Epoch 49/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2949.9744 - val_loss: 3164.5361
Epoch 50/50
597/597 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - loss: 2904.4216 - val_loss: 3165.9866
148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 956us/step - loss: 3010.7915
148/148 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step  
RMSE = 55.586
R² = 0.521
In [70]:
# Difference between test and pred
plt.figure(figsize = (40,20))
plt.plot(range(len(y_pred)),y_pred-y_test1,'lightpink')
plt.plot(range(len(y_pred)),np.zeros(4714),'black')
plt.ylim = (-300,200)
plt.title('Difference Between Test and Pred of L1 regularization Neutral Network',fontsize = 50)
plt.xlabel('No. of test',fontsize = 50)
plt.ylabel('price_difference',fontsize = 50)
In [71]:
# estimated - actual price scatter plot
test_prednn1 = pd.DataFrame(y_test1)
test_prednn1['esti_price'] = y_pred1

test_prednn1.plot(x='price',y='esti_price',kind='scatter',fontsize = 10)
plt.title('test_pred_scatternn1',fontsize = 20)
x = np.linspace(0,250)
y=x
plt.plot(x,y,'r')